library(ranger)
library(caret)
library(yardstick)
library(doParallel)
library(MLmetrics)
library(readr)

# Set the working directory to the folder containing this script, and fix the
# randomization seed.
# The script location is only discoverable through the RStudio API, so the
# directory change is attempted only when RStudio is actually running and the
# active document has been saved (an unsaved document or the console reports
# an empty path). Otherwise the current working directory is kept, and the
# relative "data/..." paths used below are resolved against it.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  }
}
set.seed(65)

# Start by reading in the training data and the test data and combining them.
# Then create an index list identifying which rows of the combined data came
# from the training file (those rows are stacked first).

# Read in the two files
fl_train <- read_csv("data/fl_train_60.csv")
fl_test <- read_csv("data/fl_test_20.csv")

# Bind the two together, training rows first.
# dplyr is not attached by this script, so bind_rows() is called through its
# namespace instead of relying on `%>%` / bind_rows() being on the search path.
fl_combined <- dplyr::bind_rows(fl_train, fl_test)

# Create a one-element list holding the integer row positions of the training
# observations inside fl_combined. seq_len() yields an empty index (rather
# than c(1, 0)) if the training file has no rows.
train_list <- list(seq_len(nrow(fl_train)))
save(train_list, file = "train_list.Rdata")

# TABLE A.2
# Class weights: the square root of the inverse population share of each race
# category. The plain inverse shares were overweighting the small categories
# too much, hence the square root.
# The shares (column 1 of Table A.2) are the proportions of the total Florida
# population according to 2018 census data.
asian.weight <- sqrt(1 / 0.0277)
black.weight <- sqrt(1 / 0.1527)
hispanic.weight <- sqrt(1 / 0.2612)
other.weight <- sqrt(1 / 0.0255)
white.weight <- sqrt(1 / 0.5326)

# Summary function used as the tuning metric during the hyperparameter search.
# Arguments follow the signature caret expects of a summaryFunction:
#   data  - data frame with the observed classes in `obs` and the predicted
#           classes in `pred`
#   lev   - outcome levels, forwarded to defaultSummary()
#   model - model name, forwarded to defaultSummary()
# Returns a named numeric vector: the macro-averaged F measure under the name
# "F1", followed by whatever defaultSummary() reports for the same data.
f1 <- function(data, lev = NULL, model = NULL) {
  macro_f <- yardstick::f_meas(
    data = data,
    truth = obs,
    estimate = pred,
    estimator = "macro"
  )
  c("F1" = macro_f$.estimate, defaultSummary(data, lev, model))
}

##### Run a set of random forests from the ranger package, including a grid
##### search over hyperparameters
# Start a timer
ptm <- proc.time()

# Change the "16" to the number of workers available to train the model.
# Worker output is redirected to workerlog.txt.
cl <- makePSOCKcluster(16, outfile = "workerlog.txt")
registerDoParallel(cl)

# Set up the set of hyperparameters to optimize.
# You can pick whatever levels of mtry, min.node.size, and whichever splitrules
# are desired for the particular application. Fewer will reduce computing time,
# more will improve the likelihood of finding the optimal hyperparameters.
# TABLE A.3
grid <- expand.grid(mtry = c(2, 3, 4),
                    splitrule = c("gini", "extratrees"),
                    min.node.size = c(10, 15, 20, 25))

# Set the terms of the hyperparameter search; candidates are ranked with the
# f1 summary function defined earlier in this script.
# `index` supplies a single resample made up of the training rows. Per the
# caret documentation, when `indexOut` is not given the rows not listed in
# `index` (here, the appended test rows) are the ones held out for evaluation.
fitControl <- trainControl(method = "CV",
                           number = 1,
                           verboseIter = FALSE,
                           summaryFunction = f1,
                           returnData = FALSE,
                           returnResamp = "none",
                           trim = TRUE,
                           index = train_list)

# Run the random forest, using the grid search parameters just defined, and the
# class weights.
# - The outcome is extracted with `[[` so that train() receives a plain vector:
#   single-column `[` subsetting of a tibble (what read_csv returns) yields a
#   one-column tibble, which train() does not accept as `y`. as.factor() makes
#   the classification outcome explicit.
# - class.weights is forwarded to ranger, which documents it as being in the
#   order of the outcome's factor levels. This assumes the sr.race levels sort
#   as asian, black, hispanic, other, white -- TODO confirm against the data.
# - The cluster is shut down in `finally`, so the workers are released even if
#   training stops with an error.
fit_classweights <- tryCatch(
  train(
    x = fl_combined[, names(fl_combined) != "sr.race", drop = FALSE],
    class.weights = c(asian.weight, black.weight, hispanic.weight,
                      other.weight, white.weight),
    y = as.factor(fl_combined[["sr.race"]]),
    method = "ranger",
    num.trees = 250,
    metric = "F1",
    tuneGrid = grid,
    trControl = fitControl
  ),
  finally = stopCluster(cl)
)

# End the timer; print explicitly so the elapsed time is also shown when the
# script is run through source()
print(proc.time() - ptm)
# Print the results of the random forest training
print(fit_classweights)

# Save the random forest training results
save(fit_classweights, file = "random_forest_model_object.Rdata")

